1 MetabolicExpressR: Metabolic subtyping of patient tumors from gene expression data

This package aims to perform metabolic pathway-based subytping of patient tumor samples from gene expression data.

The main steps are:
1. Perform GSVA on cancer patient gene expression data using KEGG metabolic pathways.
2. Perform k-means clustering on the GSVA matrix, identify metabolic subtypes. Optimal number of k is defined based on data or can be user-specified.
3. Summarize pathway activity per cluster: i.e. mean pathway activity per cluster, or do differential testing.
4. Perform Kaplan-Meier (KM) analysis with clusters if survival data are present.

1.1 Introduction

1.2 Load libraries and test data

# Set wd to current location:
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))

########## Load libraries

library("tidyverse")
library("fgsea")
# devtools::install_github('dBenedek/MetabolicExpressR')
library("MetabolicExpressR")

########## Load data

# KEGG metabolic pathways gene set:
kegg_gs <- gmtPathways("../data/kegg_metabolic_human_20211026.gmt")

# CPTAC-HNSCC
gene_exp_cptac_hnscc <- read.table("../data/CPTAC_HNSCC/HS_CPTAC_HNSCC_RNAseq_RSEM_UQ_log2_Normal.cct",
    sep = "\t", stringsAsFactors = F, header = T) %>%
    column_to_rownames("Idx")
clinical_cptac_hnscc <- read.table("../data/CPTAC_HNSCC/HS_CPTAC_HNSCC_CLI.tsi",
    sep = "\t", header = T, stringsAsFactors = F)
clinical_cptac_hnscc <- clinical_cptac_hnscc[-1, ] %>%
    mutate(case_id = str_replace_all(case_id, "-", "\\."), overall_survival = as.numeric(overall_survival),
        overall_free_status = as.numeric(overall_free_status), progression_free_status = as.numeric(progression_free_status),
        progression_free_survival = as.numeric(progression_free_survival)) %>%
    filter(P16 != "Positive (>70% nuclear and cytoplasmic staining)" & case_id %in%
        colnames(gene_exp_cptac_hnscc))

# TCGA-ACC
gene_exp_tcga_acc <- read.table("../data/TCGA_ACC/ACC.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.data.txt",
    sep = "\t", stringsAsFactors = F, header = T)
gene_exp_tcga_acc <- gene_exp_tcga_acc[-1, ] %>%
    mutate(Hybridization.REF = str_replace(Hybridization.REF, "\\|\\d+$", "")) %>%
    filter(Hybridization.REF != "?" & Hybridization.REF != "SLC35E2") %>%
    column_to_rownames("Hybridization.REF") %>%
    mutate_if(is.character, as.numeric)
colnames(gene_exp_tcga_acc) <- colnames(gene_exp_tcga_acc) %>%
    str_extract("TCGA\\.[\\w\\d]{2}\\.[\\w\\d]{4}")
clinical_cptac_acc <- read.table("../data/TCGA_ACC/HS_CPTAC_ACC_CLI.tsi", sep = "\t",
    header = T, stringsAsFactors = F) %>%
    column_to_rownames("attrib_name") %>%
    t() %>%
    as.data.frame() %>%
    rownames_to_column("case_id")
clinical_cptac_acc <- clinical_cptac_acc %>%
    mutate(overall_survival = as.numeric(overall_survival), overall_free_status = as.numeric(status))

# TCGA-UVM
gene_exp_tcga_uvm <- read.table("../data/TCGA_UVM/UVM.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.data.txt",
    sep = "\t", stringsAsFactors = F, header = T)
gene_exp_tcga_uvm <- gene_exp_tcga_uvm[-1, ] %>%
    mutate(Hybridization.REF = str_replace(Hybridization.REF, "\\|\\d+$", "")) %>%
    filter(Hybridization.REF != "?" & Hybridization.REF != "SLC35E2") %>%
    column_to_rownames("Hybridization.REF") %>%
    mutate_if(is.character, as.numeric)
colnames(gene_exp_tcga_uvm) <- colnames(gene_exp_tcga_uvm) %>%
    str_extract("TCGA\\.[\\w\\d]{2}\\.[\\w\\d]{4}")
clinical_cptac_uvm <- read.table("../data/TCGA_UVM/HS_CPTAC_UVM_CLI.tsi", sep = "\t",
    header = T, stringsAsFactors = F) %>%
    column_to_rownames("attrib_name") %>%
    t() %>%
    as.data.frame() %>%
    rownames_to_column("case_id")
clinical_cptac_uvm <- clinical_cptac_uvm %>%
    mutate(overall_survival = as.numeric(overall_survival), overall_free_status = as.numeric(status))

# TCGA-LAML
gene_exp_tcga_laml <- read.table("../data/TCGA_LAML/LAML.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.data.txt",
    sep = "\t", stringsAsFactors = F, header = T)
gene_exp_tcga_laml <- gene_exp_tcga_laml[-1, ] %>%
    mutate(Hybridization.REF = str_replace(Hybridization.REF, "\\|\\d+$", "")) %>%
    filter(Hybridization.REF != "?" & Hybridization.REF != "SLC35E2") %>%
    column_to_rownames("Hybridization.REF") %>%
    mutate_if(is.character, as.numeric)
colnames(gene_exp_tcga_laml) <- colnames(gene_exp_tcga_laml) %>%
    str_extract("TCGA\\.[\\w\\d]{2}\\.[\\w\\d]{4}")
clinical_cptac_laml <- read.table("../data/TCGA_LAML/HS_CPTAC_LAML_CLI.tsi", sep = "\t",
    header = T, stringsAsFactors = F) %>%
    column_to_rownames("attrib_name") %>%
    t() %>%
    as.data.frame() %>%
    rownames_to_column("case_id")
clinical_cptac_laml <- clinical_cptac_laml %>%
    mutate(overall_survival = as.numeric(overall_survival), overall_free_status = as.numeric(status))

# CPTAC-COAD data:
gene_exp_cptac_coad <- read.table("../data/CPTAC_COAD/Human__CPTAC_COAD__UNC__RNAseq__HiSeq_RNA__03_01_2017__BCM__Gene__BCM_RSEM_UpperQuartile_log2.cct",
    sep = "\t", stringsAsFactors = F, header = T) %>%
    column_to_rownames("attrib_name")
# CPTAC-COAD: There is no survival days in survival data

# TCGA-COAD data:
gene_exp_tcga_coad <- read.table(gzfile("../data/TCGA_COAD/Human__TCGA_COADREAD__UNC__RNAseq__HiSeq_RNA__01_28_2016__BI__Gene__Firehose_RSEM_log2.cct.gz"),
    sep = "\t", stringsAsFactors = F, header = T) %>%
    column_to_rownames("attrib_name")
clinical_tcga_coad <- read.table("../data/TCGA_COAD/HS_CPTAC_TCGA_COAD_CLI.tsi",
    sep = "\t", header = T, stringsAsFactors = F) %>%
    column_to_rownames("attrib_name") %>%
    t() %>%
    as.data.frame() %>%
    rownames_to_column("case_id")
clinical_tcga_coad <- clinical_tcga_coad %>%
    mutate(overall_survival = as.numeric(overall_survival), overall_free_status = as.numeric(status))

1.3 Perforom GSVA on gene expression data

GSVA: gene set variation analysis for microarray and RNA-Seq data
This might take some time, especially for larger (n > 100) data sets.

# CPTAC-HNSC data:
gsva_metab_cptac_hnscc <- run_gsva_metabolic(gene_exp_cptac_hnscc, kcdf = "Gaussian",
    kegg_gs = kegg_gs)

# TCGA-ACC data:
gsva_metab_tcga_acc <- run_gsva_metabolic(gene_exp_tcga_acc, kcdf = "Gaussian", kegg_gs = kegg_gs)

# TCGA-UVM data:
gsva_metab_tcga_uvm <- run_gsva_metabolic(gene_exp_tcga_uvm, kcdf = "Gaussian", kegg_gs = kegg_gs)

# TCGA-LAML data:
gsva_metab_tcga_laml <- run_gsva_metabolic(gene_exp_tcga_laml, kcdf = "Gaussian",
    kegg_gs = kegg_gs)

# TCGA-COAD data:
gsva_metab_tcga_coad <- run_gsva_metabolic(gene_exp_tcga_coad, kcdf = "Gaussian",
    kegg_gs = kegg_gs)

# CPTAC-COAD data:
gsva_metab_cptac_coad <- run_gsva_metabolic(gene_exp_cptac_coad, kcdf = "Gaussian",
    kegg_gs = kegg_gs)

1.4 K-means clustering on the GSVA matrix

It is recommended to use the optimal number of k for the clustering step. This is selected based on several indices included in the NbClust R library. Otherwise, you can set the parameters user_def_k = TRUE and k = ... in the kmeans_gsva_metabolic() function.

# CPTAC-HNSC data:
kmeans_clusters_cptac_hnscc <- kmeans_gsva_metabolic(gsva_metab_cptac_hnscc, kegg_gs = kegg_gs)
# TCGA-ACC data:
kmeans_clusters_tcga_acc <- kmeans_gsva_metabolic(gsva_metab_tcga_acc, kegg_gs = kegg_gs)
# TCGA-UVM data:
kmeans_clusters_tcga_uvm <- kmeans_gsva_metabolic(gsva_metab_tcga_uvm, kegg_gs = kegg_gs)
# TCGA-LAML data:
kmeans_clusters_tcga_laml <- kmeans_gsva_metabolic(gsva_metab_tcga_laml, kegg_gs = kegg_gs)
# TCGA-COAD data:
kmeans_clusters_tcga_coad <- kmeans_gsva_metabolic(gsva_metab_tcga_coad, kegg_gs = kegg_gs,
    user_def_k = TRUE, k = 2)

# CPTAC-COAD data:
kmeans_clusters_cptac_coad <- kmeans_gsva_metabolic(gsva_metab_cptac_coad, kegg_gs = kegg_gs)
kmeans_clusters_cptac_hnscc$heatmap

kmeans_clusters_tcga_acc$heatmap

kmeans_clusters_tcga_uvm$heatmap

kmeans_clusters_tcga_laml$heatmap

kmeans_clusters_tcga_coad$heatmap

kmeans_clusters_cptac_coad$heatmap

1.5 Barplot of mean pathway activity per cluster

plot_mean_pathway_activity(gsva_data = gsva_metab_cptac_hnscc, kegg_gs = kegg_gs,
    kmeans_res = kmeans_clusters_cptac_hnscc$kmean_res)

plot_mean_pathway_activity(gsva_data = gsva_metab_tcga_acc, kegg_gs = kegg_gs, kmeans_res = kmeans_clusters_tcga_acc$kmean_res)

plot_mean_pathway_activity(gsva_data = gsva_metab_tcga_uvm, kegg_gs = kegg_gs, kmeans_res = kmeans_clusters_tcga_uvm$kmean_res)

plot_mean_pathway_activity(gsva_data = gsva_metab_tcga_laml, kegg_gs = kegg_gs, kmeans_res = kmeans_clusters_tcga_laml$kmean_res)

plot_mean_pathway_activity(gsva_data = gsva_metab_tcga_coad, kegg_gs = kegg_gs, kmeans_res = kmeans_clusters_tcga_coad$kmean_res)

plot_mean_pathway_activity(gsva_data = gsva_metab_cptac_coad, kegg_gs = kegg_gs,
    kmeans_res = kmeans_clusters_cptac_coad$kmean_res)

1.6 PROGENy analysis and comparison of pathways between the clusters

From the website of PROGENy: “PROGENy is resource that leverages a large compendium of publicly available signaling perturbation experiments to yield a common core of pathway responsive genes for human and mouse. These, coupled with any statistical method, can be used to infer pathway activities from bulk or single-cell transcriptomics.”

# CPTAC-HNSC data:
progeny_cptac_hnsc <- run_progeny(gene_exp_data = gene_exp_cptac_hnscc, kmeans_res = kmeans_clusters_cptac_hnscc$kmean_res)
progeny_cptac_hnsc$progeny_boxplot

# TCGA-ACC data:
progeny_tcga_acc <- run_progeny(gene_exp_data = gene_exp_tcga_acc, kmeans_res = kmeans_clusters_tcga_acc$kmean_res)
progeny_tcga_acc$progeny_boxplot

# TCGA-UVM data:
progeny_tcga_uvm <- run_progeny(gene_exp_data = gene_exp_tcga_uvm, kmeans_res = kmeans_clusters_tcga_uvm$kmean_res)
progeny_tcga_uvm$progeny_boxplot

# TCGA-LAML data:
progeny_tcga_lam <- run_progeny(gene_exp_data = gene_exp_tcga_laml, kmeans_res = kmeans_clusters_tcga_laml$kmean_res)
progeny_tcga_lam$progeny_boxplot

# TCGA-COAD data:
progeny_tcga_coad <- run_progeny(gene_exp_data = gene_exp_tcga_coad, kmeans_res = kmeans_clusters_tcga_coad$kmean_res)
progeny_tcga_coad$progeny_boxplot

# CPTAC-COAD data:
progeny_cptac_coad <- run_progeny(gene_exp_data = gene_exp_cptac_coad, kmeans_res = kmeans_clusters_cptac_coad$kmean_res)
progeny_cptac_coad$progeny_boxplot

1.7 Kaplan-Meier analysis of the identified metabolic clusters

# CPTAC-HNSC data:
kmeans_metab_clust_surv(kmeans_res = kmeans_clusters_cptac_hnscc$kmean_res, clinical_data = clinical_cptac_hnscc,
    sample_id_col = "case_id", surv_status_col = "overall_free_status", surv_time_col = "overall_survival")

# TCGA-ACC data:
kmeans_metab_clust_surv(kmeans_res = kmeans_clusters_tcga_acc$kmean_res, clinical_data = clinical_cptac_acc,
    sample_id_col = "case_id", surv_status_col = "overall_free_status", surv_time_col = "overall_survival")

# TCGA-UVM data:
kmeans_metab_clust_surv(kmeans_res = kmeans_clusters_tcga_uvm$kmean_res, clinical_data = clinical_cptac_uvm,
    sample_id_col = "case_id", surv_status_col = "overall_free_status", surv_time_col = "overall_survival")

# TCGA-LAML data:
kmeans_metab_clust_surv(kmeans_res = kmeans_clusters_tcga_laml$kmean_res, clinical_data = clinical_cptac_laml,
    sample_id_col = "case_id", surv_status_col = "overall_free_status", surv_time_col = "overall_survival")

# TCGA-COAD data:
kmeans_metab_clust_surv(kmeans_res = kmeans_clusters_tcga_coad$kmean_res, clinical_data = clinical_tcga_coad,
    sample_id_col = "case_id", surv_status_col = "overall_free_status", surv_time_col = "overall_survival")

sessionInfo()
## R version 4.3.3 (2024-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0 
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Europe/Berlin
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] MetabolicExpressR_0.0.1.0 fgsea_1.25.2             
##  [3] lubridate_1.9.2           forcats_1.0.0            
##  [5] stringr_1.5.0             dplyr_1.1.2              
##  [7] purrr_1.0.1               readr_2.1.4              
##  [9] tidyr_1.3.0               tibble_3.2.1             
## [11] ggplot2_3.4.2             tidyverse_2.0.0          
## [13] BiocStyle_2.28.1         
## 
## loaded via a namespace (and not attached):
##   [1] RColorBrewer_1.1-3          rstudioapi_0.14            
##   [3] jsonlite_1.8.4              shape_1.4.6                
##   [5] magrittr_2.0.3              magick_2.8.0               
##   [7] farver_2.1.1                rmarkdown_2.21             
##   [9] GlobalOptions_0.1.2         zlibbioc_1.45.0            
##  [11] vctrs_0.6.2                 Cairo_1.6-1                
##  [13] memoise_2.0.1               DelayedMatrixStats_1.22.0  
##  [15] RCurl_1.98-1.12             rstatix_0.7.2              
##  [17] htmltools_0.5.5             S4Arrays_1.0.4             
##  [19] broom_1.0.4                 Rhdf5lib_1.22.0            
##  [21] rhdf5_2.44.0                sass_0.4.5                 
##  [23] bslib_0.4.2                 plyr_1.8.8                 
##  [25] zoo_1.8-12                  cachem_1.0.7               
##  [27] commonmark_1.9.0            lifecycle_1.0.3            
##  [29] iterators_1.0.14            pkgconfig_2.0.3            
##  [31] rsvd_1.0.5                  Matrix_1.6-3               
##  [33] R6_2.5.1                    fastmap_1.1.1              
##  [35] GenomeInfoDbData_1.2.10     MatrixGenerics_1.11.1      
##  [37] clue_0.3-64                 digest_0.6.31              
##  [39] colorspace_2.1-0            AnnotationDbi_1.61.2       
##  [41] S4Vectors_0.37.7            DESeq2_1.39.8              
##  [43] irlba_2.3.5.1               GenomicRanges_1.51.4       
##  [45] RSQLite_2.3.1               ggpubr_0.6.0               
##  [47] beachmat_2.16.0             labeling_0.4.2             
##  [49] km.ci_0.5-6                 fansi_1.0.4                
##  [51] timechange_0.2.0            abind_1.4-5                
##  [53] httr_1.4.5                  compiler_4.3.3             
##  [55] bit64_4.0.5                 withr_2.5.0                
##  [57] doParallel_1.0.17           backports_1.4.1            
##  [59] BiocParallel_1.33.12        viridis_0.6.3              
##  [61] carData_3.0-5               DBI_1.1.3                  
##  [63] highr_0.10                  HDF5Array_1.28.1           
##  [65] ggsignif_0.6.4              DelayedArray_0.26.3        
##  [67] rjson_0.2.21                tools_4.3.3                
##  [69] progeny_1.22.0              glue_1.6.2                 
##  [71] rhdf5filters_1.12.1         gridtext_0.1.5             
##  [73] grid_4.3.3                  cluster_2.1.6              
##  [75] reshape2_1.4.4              generics_0.1.3             
##  [77] gtable_0.3.3                KMsurv_0.1-5               
##  [79] tzdb_0.3.0                  survminer_0.4.9            
##  [81] data.table_1.14.8           hms_1.1.3                  
##  [83] xml2_1.3.3                  car_3.1-2                  
##  [85] BiocSingular_1.16.0         ScaledMatrix_1.8.1         
##  [87] utf8_1.2.3                  XVector_0.39.0             
##  [89] BiocGenerics_0.45.3         markdown_1.6               
##  [91] ggrepel_0.9.3               foreach_1.5.2              
##  [93] pillar_1.9.0                GSVA_1.48.0                
##  [95] splines_4.3.3               circlize_0.4.15            
##  [97] ggtext_0.1.2                lattice_0.22-5             
##  [99] survival_3.5-7              bit_4.0.5                  
## [101] annotate_1.77.0             tidyselect_1.2.0           
## [103] ComplexHeatmap_2.16.0       SingleCellExperiment_1.22.0
## [105] locfit_1.5-9.7              Biostrings_2.67.2          
## [107] knitr_1.42                  gridExtra_2.3              
## [109] bookdown_0.34               IRanges_2.33.1             
## [111] SummarizedExperiment_1.29.1 stats4_4.3.3               
## [113] xfun_0.39                   Biobase_2.59.0             
## [115] matrixStats_0.63.0          stringi_1.7.12             
## [117] yaml_2.3.7                  evaluate_0.20              
## [119] codetools_0.2-19            NbClust_3.0.1              
## [121] BiocManager_1.30.20         graph_1.78.0               
## [123] cli_3.6.1                   xtable_1.8-4               
## [125] munsell_0.5.0               jquerylib_0.1.4            
## [127] survMisc_0.5.6              Rcpp_1.0.10                
## [129] GenomeInfoDb_1.35.17        png_0.1-8                  
## [131] XML_3.99-0.14               parallel_4.3.3             
## [133] blob_1.2.4                  sparseMatrixStats_1.12.0   
## [135] bitops_1.0-7                viridisLite_0.4.1          
## [137] GSEABase_1.62.0             scales_1.2.1               
## [139] crayon_1.5.2                GetoptLong_1.0.5           
## [141] rlang_1.1.0                 cowplot_1.1.1              
## [143] fastmatch_1.1-3             KEGGREST_1.39.0            
## [145] formatR_1.14